Basa data:
Balsamorhiza sagittata
Balsamorhiza hookeri
BAHO3 x BASA hybrid
Lupinus wyethii (Outgroup)
Raw data located in:
/archive/parchman_lab/rawdata_to_backup/BASA_CHDO_rawNOVASEQ/Library2-BASA_S2_L002_R1_001.fastq.gz
Cleaned and demultiplexed .fastq files by individual:
/working/tfaske/balsam/demult/fastq (BASA)
/working/tfaske/balsam/demult/outgroup_fastq (outgroup)
Generating the reference with dDocent:
mkdir refOpt (place here 4 individuals per pop)
./ReferenceOpt.sh 4 10 4 10 SE 5 0.95 0.99 0.005
#####
R
# Plot assembled-contig counts against clustering similarity for every
# (k1, k2) combination produced by ReferenceOpt (kopt.data), then save as PDF.
library(ggplot2)
kopt <- read.table("kopt.data", header = FALSE,
                   col.names = c("k1", "k2", "Similarity", "Contigs"))
# One line per (k1, k2) pair; encode the pair as a factor for grouping/colour.
kopt$K1K2 <- as.factor(paste(kopt$k1, kopt$k2, sep = ","))
p <- ggplot(kopt, aes(x = Similarity, y = Contigs, group = K1K2)) +
  scale_x_continuous(breaks = seq(0.8, 0.98, 0.01)) +
  geom_line(aes(colour = K1K2))
p
ggsave("kvalues.pdf", p, height = 8, width = 10, units = 'in')
#####
Run dDocent on this subset with the correct assembly parameters (skipping mapping and snp calling)
./RefMapOpt.sh 4 6 4 6 0.975 SE 10
./compress.sh (compress intermediate files)
Move the reads and reference files to the main directory
Run dDocent on the full data set, skipping trimming, assembly, snp calling
SNP calling:
Run ./bwa_sort.sh
# bwa_sort.sh: align every demultiplexed individual against the dDocent
# reference and write one coordinate-sorted BAM per individual.
# Collect individual IDs by stripping the .F.fq.gz suffix from each file name.
INDS=($(for i in /working/mascaro/basa/outgroup_2/*.F.fq.gz; do echo $(basename ${i%.F.fq.gz*}); done))
# NOTE(review): pinned module versions (bwa 0.7.8, samtools 1.3) — confirm
# these match what the rest of the pipeline used.
module load bwa/0.7.8
module load samtools/1.3
for IND in ${INDS[@]};
do
# declare variables
REF=/working/mascaro/basa/outgroup_2/reference.fasta
FORWARD=/working/mascaro/basa/outgroup_2/${IND}.F.fq.gz
OUTPUT=/working/mascaro/basa/outgroup_2/assembly/${IND}_sort.bam
# then align and sort
echo "Aligning $IND with bwa"
# Single-end alignment (-M marks shorter split hits as secondary), piped
# straight through BAM conversion into a sort keyed by ${IND} temp files.
bwa mem -M -t 10 $REF $FORWARD \
| samtools view -b | \
samtools sort -T ${IND} > $OUTPUT
done
###bcftools
Run ./bcftools.sh
# bcftools.sh: joint SNP calling across all sorted BAMs against the reference.
REF=/working/mascaro/basa/outgroup_2/reference.fasta
module load bcftools/1.9
# mpileup annotates AD/DP/SP per genotype; call adds GQ/GP, uses the
# multiallelic caller (-m), and writes compressed VCF (-O z).
bcftools mpileup -a AD,DP,SP -Ou -f $REF \
./assembly/*_sort.bam | bcftools call -f GQ,GP \
-mO z -o ./basa.vcf.gz
Filtering:
####
#The obtained vcf file:
#Final.recode.vcf:
# Statistics with Vcftools:
# Per-site quality (writes quality.lqual).
vcftools --gzvcf basa.vcf.gz --site-quality --out /working/mascaro/basa/outgroup_2/filter/quality
# Biallelic allele frequencies. NOTE(review): --out here is the bare directory
# prefix "filter", so outputs land beside (not inside) the filter/ directory —
# confirm intended.
vcftools --gzvcf basa.vcf.gz --freq2 --out /working/mascaro/basa/outgroup_2/filter --max-alleles 2
# Mean depth per individual (.idepth).
vcftools --gzvcf basa.vcf.gz --depth --out /working/mascaro/basa/outgroup_2/filter/meandepthind
# Mean depth per site (.ldepth.mean).
vcftools --gzvcf basa.vcf.gz --site-mean-depth --out /working/mascaro/basa/outgroup_2/filter/meandepsite
# Missingness per individual (.imiss) and per site (.lmiss).
vcftools --gzvcf basa.vcf.gz --missing-indv --out /working/mascaro/basa/outgroup_2/filter/missing
vcftools --gzvcf basa.vcf.gz --missing-site --out /working/mascaro/basa/outgroup_2/filter/missingsite
##### Examining statistics in R
R
# Visualize the vcftools statistics: quality, depth, and missingness,
# per individual and per site.
library(ggplot2)
library(tidyverse)

# Per-site quality (.lqual: CHROM, POS, QUAL).
var_qual <- read_delim("quality.lqual", delim = "\t",
                       col_names = c("chr", "pos", "qual"), skip = 1)
a <- ggplot(var_qual, aes(qual)) +
  geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3)
a + theme_light()
ggsave("quality.pdf", a, height = 8, width = 10, units = 'in')

# Mean depth per individual. Fixed: .idepth has 3 columns
# (INDV, N_SITES, MEAN_DEPTH); the original used 4 per-site column names.
var_depth <- read_delim("meandepthind.idepth", delim = "\t",
                        col_names = c("ind", "nsites", "mean_depth"), skip = 1)
a <- ggplot(var_depth, aes(mean_depth)) +
  geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3)
a + theme_light()
summary(var_depth$mean_depth)
a + theme_light() + xlim(0, 100)
ggsave("meandepth_ind.pdf", a, height = 8, width = 10, units = 'in')

# Missingness per site (.lmiss).
var_miss <- read_delim("missingsite.lmiss", delim = "\t",
                       col_names = c("chr", "pos", "nchr", "nfiltered", "nmiss", "fmiss"),
                       skip = 1)
a <- ggplot(var_miss, aes(fmiss)) +
  geom_density(fill = "dodgerblue1", colour = "black", alpha = 0.3)
a + theme_light()
summary(var_miss$fmiss)
# Fixed: was "missing.pdf", which the per-individual plot below overwrote.
ggsave("missing_site.pdf", a, height = 8, width = 10, units = 'in')

# Mean depth per site. Fixed: .ldepth.mean has 4 columns
# (CHROM, POS, MEAN_DEPTH, VAR_DEPTH); the original used 3 per-individual names.
ind_depth <- read_delim("meandepsite.ldepth.mean", delim = "\t",
                        col_names = c("chr", "pos", "depth", "var_depth"), skip = 1)
a <- ggplot(ind_depth, aes(depth)) +
  geom_histogram(fill = "dodgerblue1", colour = "black", alpha = 0.3)
a + theme_light()
ggsave("meandepth_site.pdf", a, height = 8, width = 10, units = 'in')

# Missingness per individual (.imiss).
ind_miss <- read_delim("missing.imiss", delim = "\t",
                       col_names = c("ind", "ndata", "nfiltered", "nmiss", "fmiss"),
                       skip = 1)
a <- ggplot(ind_miss, aes(fmiss)) +
  geom_histogram(fill = "dodgerblue1", colour = "black", alpha = 0.3)
a + theme_light()
# Fixed: was "missing.pdf" again; renamed so both plots survive.
ggsave("missing_ind.pdf", a, height = 8, width = 10, units = 'in')
#######
# Filtering:
# First filtering pass. Fixed: basa.vcf.gz is gzipped, so it needs --gzvcf
# (--vcf cannot read a compressed file).
vcftools --gzvcf basa.vcf.gz --maf 0.05 --max-missing 0.6 --minQ 30 --min-meanDP 5 --max-meanDP 80 --minDP 5 --maxDP 80 --recode --out basa_filtered
# Individuals with > 50% missing data (F_MISS is column 5 of the .imiss file).
awk '$5 > 0.5' missing.imiss | cut -f1 > lowDP.indv
# --out basa_filtered above produces basa_filtered.recode.vcf; the original
# command pointed at the non-existent basa_filtered.vcf.recode.vcf.
vcftools --vcf basa_filtered.recode.vcf --remove lowDP.indv --recode --recode-INFO-all --out variants_clean
vcftools --vcf variants_clean.recode.vcf --out basa_filtered_final --recode --remove-filtered-all
# Statistics with Vcftools:
# Statistics on the filtered set. Fixed: the recode output is a plain-text
# VCF, so --vcf (not --gzvcf); the ".vcff" typo; and the unset $OUT prefix.
vcftools --vcf basa_filtered_final.recode.vcf --site-quality --out quality
vcftools --vcf basa_filtered_final.recode.vcf --freq2 --out freq --max-alleles 2
vcftools --vcf basa_filtered_final.recode.vcf --depth --out meandepthind
vcftools --vcf basa_filtered_final.recode.vcf --site-mean-depth --out meandepsite
vcftools --vcf basa_filtered_final.recode.vcf --missing-indv --out missing
vcftools --vcf basa_filtered_final.recode.vcf --missing-site --out missingsite
# Filtering again:
# Second, stricter filtering pass. Fixed: the command had been wrapped
# mid-option in the notes ("--m" / "inQ" is --minQ).
vcftools --vcf basa_filtered_final.recode.vcf --maf 0.05 --max-missing 0.75 --minQ 30 --min-meanDP 10 --max-meanDP 80 --minDP 10 --maxDP 80 --recode --out basa_final2
# Statistics with Vcftools. Fixed: --out basa_final2 above writes
# basa_final2.recode.vcf; the originals pointed at a non-existent
# basa_filtered2.recode.vcf (and used --gzvcf on plain-text VCF).
vcftools --vcf basa_final2.recode.vcf --site-quality --out quality
vcftools --vcf basa_final2.recode.vcf --freq2 --out freq --max-alleles 2
vcftools --vcf basa_final2.recode.vcf --depth --out meandepthind
vcftools --vcf basa_final2.recode.vcf --site-mean-depth --out meandepsite
vcftools --vcf basa_final2.recode.vcf --missing-indv --out missing
vcftools --vcf basa_final2.recode.vcf --missing-site --out missingsite
After filtering: 27621 loci and 379 individuals
Genetic diversity statistics:
Overall, excess of heterozygotes (Ho>He; negative Fis values), high nucleotide diversity, and low Fst values.
AMOVA showing only 22 % of the variance explained by population.
# Genetic diversity (Ho, Hs, Fis) and genetic distance (Nei's Ds) per
# population, computed with hierfstat from the filtered VCF.
library(vcfR)
library("adegenet")
library("hierfstat")
library("pegas")
# Load the final filtered VCF and convert it to a genind object.
basa.vcf <- read.vcfR("basa_final.vcf")
basa.vcf
##Fis, Fst
my_genind <- vcfR2genind(basa.vcf)
x<- my_genind
# NOTE(review): `pops` is used here before it is defined anywhere in this
# script — the line is circular. It presumably should be a vector of
# population labels, one per individual, built from a sample sheet; confirm
# and define it before this point.
pops <- as.factor(c(pops))
#Population specific Fis:
myData.hfstat <- genind2hierfstat(my_genind, pop = pops)
basicstat <- basic.stats(myData.hfstat, diploid = TRUE, digits = 4)
basicstat$Fis
write.csv(basicstat$Fis, "Fis.csv")
##Bootstrapping over loci of population's Fis
boot.ppfis(myData.hfstat)
#Nei's Pairwise FSTs:
x <- genet.dist(myData.hfstat,diploid=TRUE,method="Ds")# Nei’s standard genetic distance
fst <- as.matrix(x)
# NOTE(review): despite the file name, method="Ds" means this matrix holds
# Nei's standard genetic distances, not Fst values.
write.table(fst, "Fst.txt")
##Bootstrapping over loci of pairwise Fst
#boot.ppfst(myData.hfstat)
basicstat$Ho #observed
write.csv(basicstat$Ho, "HO.csv")
basicstat$Hs #expected
write.csv(basicstat$Hs, "Hs.csv")
basicstat
###########################################
##vcftools Pi and TajimaD
#!/bin/sh
# .vcf file
# .pop file (unique names of pops, one per line)
# .map file (individual to population mapping file — 2 columns)
#cat basa.pop | while read line;
#do
#grep "$line" basa.map > $line.pop
#done
#for p in *.pop
#do
#vcftools --vcf basa_final.vcf --keep $p --site-pi --out $p
#done
#for p in *.pop
#do
#vcftools --vcf basa_final.vcf --keep $p --TajimaD 100 --out $p
#done
#AMOVA:
# AMOVA: partition genetic variance among/within populations using the
# stratification in ind_pop.txt (pop / subpop columns).
library(vcfR)
library("adegenet")
library("hierfstat")
library("pegas")
library("poppr")
library("magrittr")
library(ape)
# vcf to genind
BASA.VCF <- read.vcfR("basa.vcf")
my_genind <- vcfR2genind(BASA.VCF)
### Attach the stratification (individual, population, subpopulation...)
file.hier = read.table("ind_pop.txt", header=T)
strata(my_genind) = file.hier
my_genind
## AMOVA with stratifications.
# Fixed: the original called poppr.amova(acth.hier.strat, ...), an object
# from a different project that is never defined in this script; the
# stratified object built above is my_genind.
amova.res.95 = poppr.amova(my_genind, ~pop/subpop, cutoff=0.95)
amova.res.95
write.table(amova.res.95$results, sep = ",", file = "results_amova.csv")
write.table(amova.res.95$componentsofcovariance, sep = ",", file = "Components_covariance.csv")
write.table(amova.res.95$statphi, sep = ",", file = "phi.csv")
## Permutation test: are populations significantly differentiated?
amova.test.95 = randtest(amova.res.95)
amova.test.95
| Pop | He | Ho | Fis | π | TajimaD |
|---|---|---|---|---|---|
| TM | 0.284 | 0.428 | -0.385 | 0.241 | 0.313 |
| KB | 0.288 | 0.430 | -0.357 | 0.239 | 0.339 |
| CC | 0.291 | 0.432 | -0.354 | 0.246 | 0.275 |
| KA | 0.291 | 0.432 | -0.372 | 0.249 | 0.295 |
| RP | 0.290 | 0.432 | -0.364 | 0.251 | 0.272 |
| BH | 0.289 | 0.434 | -0.380 | 0.231 | 0.356 |
| LL | 0.288 | 0.434 | -0.385 | 0.242 | 0.286 |
| CF | 0.303 | 0.435 | -0.328 | 0.265 | 0.222 |
| CL | 0.290 | 0.437 | -0.377 | 0.246 | 0.333 |
| SP | 0.293 | 0.437 | -0.373 | 0.251 | 0.271 |
| EC | 0.295 | 0.442 | -0.363 | 0.253 | 0.265 |
| GA | 0.298 | 0.442 | -0.358 | 0.262 | 0.242 |
| AN | 0.298 | 0.444 | -0.358 | 0.258 | 0.308 |
| SL | 0.300 | 0.444 | -0.358 | 0.261 | 0.211 |
| SC | 0.300 | 0.444 | -0.349 | 0.265 | 0.227 |
| LC | 0.299 | 0.445 | -0.362 | 0.251 | 0.351 |
| AS | 0.300 | 0.445 | -0.352 | 0.271 | 0.284 |
| BB | 0.305 | 0.446 | -0.339 | 0.268 | 0.262 |
| WC | 0.304 | 0.447 | -0.342 | 0.271 | 0.272 |
| JV | 0.304 | 0.448 | -0.346 | 0.264 | 0.285 |
| SD | 0.302 | 0.449 | -0.355 | 0.271 | 0.299 |
| KM | 0.301 | 0.449 | -0.382 | 0.281 | 0.367 |
| RC | 0.303 | 0.450 | -0.351 | 0.272 | 0.265 |
| RL | 0.304 | 0.450 | -0.348 | 0.272 | 0.283 |
| GJ | 0.301 | 0.451 | -0.362 | 0.269 | 0.263 |
| MM | 0.303 | 0.452 | -0.365 | 0.272 | 0.311 |
| GB | 0.307 | 0.453 | -0.347 | 0.277 | 0.242 |
| CN | 0.308 | 0.454 | -0.347 | 0.272 | 0.261 |
| SY | 0.306 | 0.455 | -0.358 | 0.268 | 0.274 |
| MT | 0.303 | 0.456 | -0.372 | 0.276 | 0.324 |
| LM | 0.306 | 0.456 | -0.356 | 0.279 | 0.222 |
| QG | 0.307 | 0.457 | -0.359 | 0.271 | 0.311 |
/* Render paragraphs inside .inline-btns containers inline so the download
   buttons in the knitted document sit on one line. */
.inline-btns p {
display: inline;
}
# Build a download button for the Fst table in the rendered Rmd.
library(downloadthis)
Fst_results <- read.csv("/home/caro/Escritorio/BASA_2/basa_Rmd/Fst.csv")
d_btn <- Fst_results %>%
download_this(
# NOTE(review): system.file() resolves paths *inside an installed package*;
# called with an absolute filesystem path it returns "". Confirm whether
# `path` is needed at all when piping a data frame into download_this().
path = system.file("/home/caro/Escritorio/BASA_2/basa_Rmd/", package = "downloadthis"),
output_name = "Fst values",
output_extension = ".csv",
button_label = "Fst",
button_type = "default",
has_icon = TRUE,
icon = "fa fa-save"
)
| X | AN | AS | BB | BH | CC | CF | CL | CN | EC | GA | GB | GJ | JV | KA | KB | KM | LC | LL | LM | MM | MT | QG | RC | RL | RP | SC | SD | SL | SP | SY | TM | WC |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| AN | 0.0000000 | 0.0212699 | 0.0200213 | 0.0346840 | 0.0327182 | 0.0267836 | 0.0246469 | 0.0189064 | 0.0190003 | 0.0266201 | 0.0175230 | 0.0164721 | 0.0162117 | 0.0355746 | 0.0303433 | 0.0285037 | 0.0221734 | 0.0338473 | 0.0188846 | 0.0178398 | 0.0170678 | 0.0179127 | 0.0164681 | 0.0181886 | 0.0323123 | 0.0207600 | 0.0178579 | 0.0283760 | 0.0337146 | 0.0169339 | 0.0363946 | 0.0228502 |
| AS | 0.0212699 | 0.0000000 | 0.0186616 | 0.0329252 | 0.0235971 | 0.0224353 | 0.0253128 | 0.0183065 | 0.0245430 | 0.0258348 | 0.0176178 | 0.0197878 | 0.0189161 | 0.0335354 | 0.0296384 | 0.0260247 | 0.0236629 | 0.0337474 | 0.0174792 | 0.0202507 | 0.0204187 | 0.0200585 | 0.0185294 | 0.0197295 | 0.0297079 | 0.0201100 | 0.0185614 | 0.0272561 | 0.0262603 | 0.0216470 | 0.0280640 | 0.0152160 |
| BB | 0.0200213 | 0.0186616 | 0.0000000 | 0.0319076 | 0.0244552 | 0.0214392 | 0.0248414 | 0.0180955 | 0.0231898 | 0.0247534 | 0.0179690 | 0.0187357 | 0.0174954 | 0.0321670 | 0.0286313 | 0.0282892 | 0.0223153 | 0.0331018 | 0.0182653 | 0.0198932 | 0.0202839 | 0.0192087 | 0.0166406 | 0.0183536 | 0.0289344 | 0.0155876 | 0.0196366 | 0.0263040 | 0.0267735 | 0.0195137 | 0.0290556 | 0.0178810 |
| BH | 0.0346840 | 0.0329252 | 0.0319076 | 0.0000000 | 0.0341948 | 0.0348293 | 0.0410418 | 0.0333634 | 0.0331291 | 0.0158469 | 0.0328156 | 0.0339422 | 0.0340365 | 0.0252982 | 0.0261380 | 0.0397149 | 0.0390792 | 0.0295467 | 0.0321425 | 0.0365994 | 0.0370556 | 0.0365161 | 0.0333541 | 0.0299791 | 0.0154639 | 0.0342866 | 0.0354557 | 0.0220385 | 0.0350017 | 0.0367025 | 0.0365534 | 0.0304946 |
| CC | 0.0327182 | 0.0235971 | 0.0244552 | 0.0341948 | 0.0000000 | 0.0237590 | 0.0374692 | 0.0258122 | 0.0321994 | 0.0289741 | 0.0265930 | 0.0314310 | 0.0308357 | 0.0325807 | 0.0299775 | 0.0322526 | 0.0352532 | 0.0345354 | 0.0228768 | 0.0331576 | 0.0342563 | 0.0327005 | 0.0293691 | 0.0297479 | 0.0296424 | 0.0282460 | 0.0303569 | 0.0288153 | 0.0189424 | 0.0340537 | 0.0198847 | 0.0168270 |
| CF | 0.0267836 | 0.0224353 | 0.0214392 | 0.0348293 | 0.0237590 | 0.0000000 | 0.0294367 | 0.0231731 | 0.0279445 | 0.0281877 | 0.0232136 | 0.0245153 | 0.0219338 | 0.0340184 | 0.0315772 | 0.0322944 | 0.0277815 | 0.0364529 | 0.0231211 | 0.0258244 | 0.0272402 | 0.0251380 | 0.0218596 | 0.0236070 | 0.0300860 | 0.0206356 | 0.0260934 | 0.0295962 | 0.0262150 | 0.0268463 | 0.0277910 | 0.0209021 |
| CL | 0.0246469 | 0.0253128 | 0.0248414 | 0.0410418 | 0.0374692 | 0.0294367 | 0.0000000 | 0.0251011 | 0.0299262 | 0.0331434 | 0.0231170 | 0.0231664 | 0.0216017 | 0.0409459 | 0.0380001 | 0.0359048 | 0.0166704 | 0.0425012 | 0.0262426 | 0.0236694 | 0.0238168 | 0.0231343 | 0.0217015 | 0.0238932 | 0.0375295 | 0.0237068 | 0.0242312 | 0.0350436 | 0.0397157 | 0.0242384 | 0.0427709 | 0.0276300 |
| CN | 0.0189064 | 0.0183065 | 0.0180955 | 0.0333634 | 0.0258122 | 0.0231731 | 0.0251011 | 0.0000000 | 0.0229948 | 0.0253191 | 0.0149444 | 0.0173068 | 0.0171130 | 0.0341757 | 0.0295207 | 0.0233035 | 0.0221045 | 0.0332522 | 0.0130664 | 0.0180485 | 0.0172908 | 0.0177681 | 0.0162922 | 0.0178054 | 0.0305035 | 0.0193922 | 0.0175124 | 0.0271138 | 0.0276345 | 0.0174463 | 0.0301545 | 0.0171157 |
| EC | 0.0190003 | 0.0245430 | 0.0231898 | 0.0331291 | 0.0321994 | 0.0279445 | 0.0299262 | 0.0229948 | 0.0000000 | 0.0263762 | 0.0226038 | 0.0214934 | 0.0211447 | 0.0335061 | 0.0287778 | 0.0328450 | 0.0274776 | 0.0324182 | 0.0233297 | 0.0234037 | 0.0230408 | 0.0231056 | 0.0212814 | 0.0224650 | 0.0305677 | 0.0242452 | 0.0236657 | 0.0273561 | 0.0339258 | 0.0228591 | 0.0364258 | 0.0244998 |
| GA | 0.0266201 | 0.0258348 | 0.0247534 | 0.0158469 | 0.0289741 | 0.0281877 | 0.0331434 | 0.0253191 | 0.0263762 | 0.0000000 | 0.0251998 | 0.0256548 | 0.0263371 | 0.0236291 | 0.0221377 | 0.0338229 | 0.0312204 | 0.0257589 | 0.0239398 | 0.0283194 | 0.0280406 | 0.0278849 | 0.0253777 | 0.0224607 | 0.0151134 | 0.0270017 | 0.0268538 | 0.0184739 | 0.0294554 | 0.0275287 | 0.0315750 | 0.0235851 |
| GB | 0.0175230 | 0.0176178 | 0.0179690 | 0.0328156 | 0.0265930 | 0.0232136 | 0.0231170 | 0.0149444 | 0.0226038 | 0.0251998 | 0.0000000 | 0.0167527 | 0.0163814 | 0.0337995 | 0.0289854 | 0.0245593 | 0.0206033 | 0.0324782 | 0.0145730 | 0.0166393 | 0.0159460 | 0.0162265 | 0.0153821 | 0.0171601 | 0.0304754 | 0.0192077 | 0.0164729 | 0.0266545 | 0.0282273 | 0.0161405 | 0.0305152 | 0.0173996 |
| GJ | 0.0164721 | 0.0197878 | 0.0187357 | 0.0339422 | 0.0314310 | 0.0245153 | 0.0231664 | 0.0173068 | 0.0214934 | 0.0256548 | 0.0167527 | 0.0000000 | 0.0151043 | 0.0353594 | 0.0304644 | 0.0271990 | 0.0212521 | 0.0339120 | 0.0176500 | 0.0166582 | 0.0158933 | 0.0165035 | 0.0137941 | 0.0161212 | 0.0314389 | 0.0182391 | 0.0166257 | 0.0276478 | 0.0327586 | 0.0149816 | 0.0354195 | 0.0214097 |
| JV | 0.0162117 | 0.0189161 | 0.0174954 | 0.0340365 | 0.0308357 | 0.0219338 | 0.0216017 | 0.0171130 | 0.0211447 | 0.0263371 | 0.0163814 | 0.0151043 | 0.0000000 | 0.0344323 | 0.0307617 | 0.0287256 | 0.0191716 | 0.0352049 | 0.0189470 | 0.0157196 | 0.0164529 | 0.0153160 | 0.0131920 | 0.0159127 | 0.0308224 | 0.0156749 | 0.0179585 | 0.0279383 | 0.0339841 | 0.0166767 | 0.0363474 | 0.0213632 |
| KA | 0.0355746 | 0.0335354 | 0.0321670 | 0.0252982 | 0.0325807 | 0.0340184 | 0.0409459 | 0.0341757 | 0.0335061 | 0.0236291 | 0.0337995 | 0.0353594 | 0.0344323 | 0.0000000 | 0.0190681 | 0.0414175 | 0.0389326 | 0.0235086 | 0.0329279 | 0.0374017 | 0.0383461 | 0.0366842 | 0.0342726 | 0.0317150 | 0.0193344 | 0.0337279 | 0.0370079 | 0.0175659 | 0.0345171 | 0.0383341 | 0.0358064 | 0.0298833 |
| KB | 0.0303433 | 0.0296384 | 0.0286313 | 0.0261380 | 0.0299775 | 0.0315772 | 0.0380001 | 0.0295207 | 0.0287778 | 0.0221377 | 0.0289854 | 0.0304644 | 0.0307617 | 0.0190681 | 0.0000000 | 0.0368059 | 0.0359200 | 0.0117009 | 0.0277477 | 0.0323038 | 0.0326546 | 0.0324584 | 0.0301154 | 0.0282422 | 0.0217759 | 0.0314428 | 0.0311908 | 0.0167801 | 0.0306686 | 0.0325043 | 0.0325571 | 0.0264465 |
| KM | 0.0285037 | 0.0260247 | 0.0282892 | 0.0397149 | 0.0322526 | 0.0322944 | 0.0359048 | 0.0233035 | 0.0328450 | 0.0338229 | 0.0245593 | 0.0271990 | 0.0287256 | 0.0414175 | 0.0368059 | 0.0000000 | 0.0352685 | 0.0398091 | 0.0221475 | 0.0294939 | 0.0284646 | 0.0302520 | 0.0271601 | 0.0284409 | 0.0384818 | 0.0309566 | 0.0264761 | 0.0357467 | 0.0334697 | 0.0279264 | 0.0353186 | 0.0258690 |
| LC | 0.0221734 | 0.0236629 | 0.0223153 | 0.0390792 | 0.0352532 | 0.0277815 | 0.0166704 | 0.0221045 | 0.0274776 | 0.0312204 | 0.0206033 | 0.0212521 | 0.0191716 | 0.0389326 | 0.0359200 | 0.0352685 | 0.0000000 | 0.0406755 | 0.0236781 | 0.0207346 | 0.0214815 | 0.0193848 | 0.0195040 | 0.0214650 | 0.0355828 | 0.0210133 | 0.0221636 | 0.0324812 | 0.0372541 | 0.0218466 | 0.0410061 | 0.0251646 |
| LL | 0.0338473 | 0.0337474 | 0.0331018 | 0.0295467 | 0.0345354 | 0.0364529 | 0.0425012 | 0.0332522 | 0.0324182 | 0.0257589 | 0.0324782 | 0.0339120 | 0.0352049 | 0.0235086 | 0.0117009 | 0.0398091 | 0.0406755 | 0.0000000 | 0.0309655 | 0.0359579 | 0.0358022 | 0.0360280 | 0.0341083 | 0.0318546 | 0.0261726 | 0.0363706 | 0.0344018 | 0.0211644 | 0.0340164 | 0.0355534 | 0.0361843 | 0.0305224 |
| LM | 0.0188846 | 0.0174792 | 0.0182653 | 0.0321425 | 0.0228768 | 0.0231211 | 0.0262426 | 0.0130664 | 0.0233297 | 0.0239398 | 0.0145730 | 0.0176500 | 0.0189470 | 0.0329279 | 0.0277477 | 0.0221475 | 0.0236781 | 0.0309655 | 0.0000000 | 0.0188936 | 0.0179248 | 0.0188001 | 0.0172917 | 0.0183495 | 0.0295287 | 0.0207955 | 0.0168323 | 0.0256663 | 0.0238840 | 0.0177935 | 0.0264541 | 0.0149584 |
| MM | 0.0178398 | 0.0202507 | 0.0198932 | 0.0365994 | 0.0331576 | 0.0258244 | 0.0236694 | 0.0180485 | 0.0234037 | 0.0283194 | 0.0166393 | 0.0166582 | 0.0157196 | 0.0374017 | 0.0323038 | 0.0294939 | 0.0207346 | 0.0359579 | 0.0188936 | 0.0000000 | 0.0125471 | 0.0124966 | 0.0154447 | 0.0177282 | 0.0337550 | 0.0192789 | 0.0166609 | 0.0293564 | 0.0347920 | 0.0166414 | 0.0375904 | 0.0223864 |
| MT | 0.0170678 | 0.0204187 | 0.0202839 | 0.0370556 | 0.0342563 | 0.0272402 | 0.0238168 | 0.0172908 | 0.0230408 | 0.0280406 | 0.0159460 | 0.0158933 | 0.0164529 | 0.0383461 | 0.0326546 | 0.0284646 | 0.0214815 | 0.0358022 | 0.0179248 | 0.0125471 | 0.0000000 | 0.0125909 | 0.0156450 | 0.0177622 | 0.0349554 | 0.0205679 | 0.0152600 | 0.0297690 | 0.0350223 | 0.0144608 | 0.0381636 | 0.0227158 |
| QG | 0.0179127 | 0.0200585 | 0.0192087 | 0.0365161 | 0.0327005 | 0.0251380 | 0.0231343 | 0.0177681 | 0.0231056 | 0.0278849 | 0.0162265 | 0.0165035 | 0.0153160 | 0.0366842 | 0.0324584 | 0.0302520 | 0.0193848 | 0.0360280 | 0.0188001 | 0.0124966 | 0.0125909 | 0.0000000 | 0.0150762 | 0.0174641 | 0.0333414 | 0.0184967 | 0.0165357 | 0.0293283 | 0.0343993 | 0.0161502 | 0.0377306 | 0.0215739 |
| RC | 0.0164681 | 0.0185294 | 0.0166406 | 0.0333541 | 0.0293691 | 0.0218596 | 0.0217015 | 0.0162922 | 0.0212814 | 0.0253777 | 0.0153821 | 0.0137941 | 0.0131920 | 0.0342726 | 0.0301154 | 0.0271601 | 0.0195040 | 0.0341083 | 0.0172917 | 0.0154447 | 0.0156450 | 0.0150762 | 0.0000000 | 0.0138413 | 0.0306514 | 0.0152993 | 0.0169273 | 0.0274372 | 0.0318366 | 0.0144455 | 0.0345417 | 0.0203048 |
| RL | 0.0181886 | 0.0197295 | 0.0183536 | 0.0299791 | 0.0297479 | 0.0236070 | 0.0238932 | 0.0178054 | 0.0224650 | 0.0224607 | 0.0171601 | 0.0161212 | 0.0159127 | 0.0317150 | 0.0282422 | 0.0284409 | 0.0214650 | 0.0318546 | 0.0183495 | 0.0177282 | 0.0177622 | 0.0174641 | 0.0138413 | 0.0000000 | 0.0275385 | 0.0181671 | 0.0185550 | 0.0253345 | 0.0313771 | 0.0167345 | 0.0345433 | 0.0207855 |
| RP | 0.0323123 | 0.0297079 | 0.0289344 | 0.0154639 | 0.0296424 | 0.0300860 | 0.0375295 | 0.0305035 | 0.0305677 | 0.0151134 | 0.0304754 | 0.0314389 | 0.0308224 | 0.0193344 | 0.0217759 | 0.0384818 | 0.0355828 | 0.0261726 | 0.0295287 | 0.0337550 | 0.0349554 | 0.0333414 | 0.0306514 | 0.0275385 | 0.0000000 | 0.0302712 | 0.0332392 | 0.0177010 | 0.0313996 | 0.0345974 | 0.0331327 | 0.0267652 |
| SC | 0.0207600 | 0.0201100 | 0.0155876 | 0.0342866 | 0.0282460 | 0.0206356 | 0.0237068 | 0.0193922 | 0.0242452 | 0.0270017 | 0.0192077 | 0.0182391 | 0.0156749 | 0.0337279 | 0.0314428 | 0.0309566 | 0.0210133 | 0.0363706 | 0.0207955 | 0.0192789 | 0.0205679 | 0.0184967 | 0.0152993 | 0.0181671 | 0.0302712 | 0.0000000 | 0.0214085 | 0.0286619 | 0.0325139 | 0.0200717 | 0.0347512 | 0.0207936 |
| SD | 0.0178579 | 0.0185614 | 0.0196366 | 0.0354557 | 0.0303569 | 0.0260934 | 0.0242312 | 0.0175124 | 0.0236657 | 0.0268538 | 0.0164729 | 0.0166257 | 0.0179585 | 0.0370079 | 0.0311908 | 0.0264761 | 0.0221636 | 0.0344018 | 0.0168323 | 0.0166609 | 0.0152600 | 0.0165357 | 0.0169273 | 0.0185550 | 0.0332392 | 0.0214085 | 0.0000000 | 0.0288431 | 0.0303812 | 0.0160337 | 0.0328389 | 0.0200819 |
| SL | 0.0283760 | 0.0272561 | 0.0263040 | 0.0220385 | 0.0288153 | 0.0295962 | 0.0350436 | 0.0271138 | 0.0273561 | 0.0184739 | 0.0266545 | 0.0276478 | 0.0279383 | 0.0175659 | 0.0167801 | 0.0357467 | 0.0324812 | 0.0211644 | 0.0256663 | 0.0293564 | 0.0297690 | 0.0293283 | 0.0274372 | 0.0253345 | 0.0177010 | 0.0286619 | 0.0288431 | 0.0000000 | 0.0293650 | 0.0301524 | 0.0314807 | 0.0244301 |
| SP | 0.0337146 | 0.0262603 | 0.0267735 | 0.0350017 | 0.0189424 | 0.0262150 | 0.0397157 | 0.0276345 | 0.0339258 | 0.0294554 | 0.0282273 | 0.0327586 | 0.0339841 | 0.0345171 | 0.0306686 | 0.0334697 | 0.0372541 | 0.0340164 | 0.0238840 | 0.0347920 | 0.0350223 | 0.0343993 | 0.0318366 | 0.0313771 | 0.0313996 | 0.0325139 | 0.0303812 | 0.0293650 | 0.0000000 | 0.0344131 | 0.0174452 | 0.0192507 |
| SY | 0.0169339 | 0.0216470 | 0.0195137 | 0.0367025 | 0.0340537 | 0.0268463 | 0.0242384 | 0.0174463 | 0.0228591 | 0.0275287 | 0.0161405 | 0.0149816 | 0.0166767 | 0.0383341 | 0.0325043 | 0.0279264 | 0.0218466 | 0.0355534 | 0.0177935 | 0.0166414 | 0.0144608 | 0.0161502 | 0.0144455 | 0.0167345 | 0.0345974 | 0.0200717 | 0.0160337 | 0.0301524 | 0.0344131 | 0.0000000 | 0.0375182 | 0.0229745 |
| TM | 0.0363946 | 0.0280640 | 0.0290556 | 0.0365534 | 0.0198847 | 0.0277910 | 0.0427709 | 0.0301545 | 0.0364258 | 0.0315750 | 0.0305152 | 0.0354195 | 0.0363474 | 0.0358064 | 0.0325571 | 0.0353186 | 0.0410061 | 0.0361843 | 0.0264541 | 0.0375904 | 0.0381636 | 0.0377306 | 0.0345417 | 0.0345433 | 0.0331327 | 0.0347512 | 0.0328389 | 0.0314807 | 0.0174452 | 0.0375182 | 0.0000000 | 0.0210962 |
| WC | 0.0228502 | 0.0152160 | 0.0178810 | 0.0304946 | 0.0168270 | 0.0209021 | 0.0276300 | 0.0171157 | 0.0244998 | 0.0235851 | 0.0173996 | 0.0214097 | 0.0213632 | 0.0298833 | 0.0264465 | 0.0258690 | 0.0251646 | 0.0305224 | 0.0149584 | 0.0223864 | 0.0227158 | 0.0215739 | 0.0203048 | 0.0207855 | 0.0267652 | 0.0207936 | 0.0200819 | 0.0244301 | 0.0192507 | 0.0229745 | 0.0210962 | 0.0000000 |
| Source of variation | Sigma | % |
|---|---|---|
| Variation between samples | 278.0909 | 22.07262 |
| Variation within samples | 981.7995 | 77.92738 |
| Total variation | 1259.8903 | 100.00000 |
Map, PCA, UMAP, entropy:
B. sagittata map:
#devtools::install_github("dkahle/ggmap")
# Sampling-site map for the B. sagittata populations, with labeled points
# coloured per population.
library(ggmap)
library(devtools)
require(ggplot2)
require(ggsci)
require(ggrepel)
library(data.table)
library(patchwork)
setwd("/home/caro/Escritorio/BASA_2/con_outgroup/Map/")
# coordinates.csv: one row per population (population code, latitude, longitude).
basacoord <- read.csv('coordinates.csv', header=T)
names(basacoord) <- c('Pop','Lat','Long')
# NOTE(review): Stamen tiles moved to Stadia Maps; recent ggmap versions need
# get_stadiamap()/an API key. Confirm this call still works with the ggmap
# version in use.
map <- get_stamenmap(bbox = c(left = -120.5, bottom = 37.00, right = -109.0, top = 49.00),
zoom=8,scale = 3,maptype = 'terrain-background')
##zoom controls resolution, 10 takes forever
# Quick unlabeled preview of the sampling sites.
ggmap(map) + geom_point(data = basacoord, aes(x = Long, y = Lat),size=3,pch=21,fill="white",col="black") +
xlab("Longitude") + ylab("Latitude") +
coord_map(xlim=c(-120.5,-109.0),ylim=c(37.00,49.00)) #selects the range for x and y
# Fill colours keyed by population code (shared with the PCA/UMAP figures).
group.colors <- c(AN = "#F0F8FF", AS ="#E9967A", BH ="#FFEBCD", BB = "#F08080",CF = "#8B0000",CC ="#FF3030",CN ="#CD6090",CL = "#8B3A62", EC = "#FF82AB",
GA ="#FFC0CB",GB ="#FFE1FF", GJ="#CDB5CD",JV ="#8B7B8B", KA ="#9B30FF",KB="#8470FF",KM ="#27408B",LM ="#87CEFA",
LL="#104E8B",LC="#8DB6CD",MM="#6E7B8B",MT="#FFB90F",QG="#EE7600",RC="#FFF68F",RL="#8B7500",
SL="#2F4F4F",SD="#008B8B",SP="#006400",SC="#BDB76B", SY="#ADFF2F", TM="#00CD66",RP="#00CDCD", WC="#BBFFFF")
# Final labeled map (note: `map` is reassigned from raster to ggplot here).
map <- ggmap(map) + geom_point(data = basacoord, aes(x = Long, y = Lat),size=2,col="black",fill='white',pch=21) +
geom_label_repel(data = basacoord, aes(x = Long, y = Lat,label = Pop,fill=Pop),
colour = "black", fontface = "bold") +
scale_fill_manual(values=group.colors) +
coord_map(xlim=c(-120.5,-109.0),ylim=c(37.00,49.00)) +
xlab("Longitude") + ylab("Latitude") + theme_bw() +
theme(legend.position = 'none',
axis.text = element_text(size=13),
axis.title = element_text(size = 15, colour="black",face = "bold", vjust = 1),
panel.border = element_rect(size = 1.5, colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
map
ggsave('map_basa.pdf',height=8,width=5)
PCA:
library(data.table)
library(ggplot2)
library(ggsci)
library(umap)
library(LEA)
library(readr)
library(ggpubr)
setwd("/home/caro/Escritorio/BASA_2/con_outgroup/PCA_UMAP/")
##### PCA #####
# PCA of a vcftools .012 genotype matrix (rows = individuals, cols = loci).
#
# df_gen: matrix/data frame of 0/1/2 genotype counts; -1 marks missing data.
# indv:   data frame with one row per individual, providing $Pop and $ID.
# num:    number of principal components to keep (clamped to what exists).
#
# Returns list(pca_df = data frame of PC scores + Pop + ID,
#              pve    = proportion of variance explained per PC).
PCA_gen <- function(df_gen, indv, num = 5){
  # Recode vcftools' -1 (missing) to NA, then force numeric.
  df_gen <- apply(df_gen, 2, function(df) gsub(-1, NA, df, fixed = TRUE))
  df_gen <- apply(df_gen, 2, function(df) as.numeric(df))
  colmean <- apply(df_gen, 2, mean, na.rm = TRUE)
  normalize <- matrix(nrow = nrow(df_gen), ncol = ncol(df_gen))
  af <- colmean / 2  # per-locus allele-frequency estimate
  for (m in seq_along(af)){
    nr <- df_gen[, m] - colmean[m]
    dn <- sqrt(af[m] * (1 - af[m]))  # scale by sqrt(p(1-p))
    normalize[, m] <- nr / dn
  }
  # Missing / monomorphic cells contribute nothing after centering.
  normalize[is.na(normalize)] <- 0
  method1 <- prcomp(normalize, scale. = FALSE, center = FALSE)
  pve <- summary(method1)$importance[2, ]
  # Fixed: head() avoids the NA padding that pve[1:50] produced when
  # fewer than 50 PCs exist.
  print(head(pve, 50))
  # Fixed: prcomp yields at most min(nrow, ncol) PCs; the old row-count-only
  # check still allowed a subscript-out-of-bounds on wide requests.
  num <- min(num, ncol(method1$x))
  pca_X <- as.data.frame(method1$x[, 1:num, drop = FALSE])
  pca_X$Pop <- indv$Pop
  pca_X$ID <- indv$ID
  pca_out <- list("pca_df" = pca_X, "pve" = pve)
  #print(PCA_fig(pca_out))
  return(pca_out)
}
######################## PCA_fig() ###################
######################## PCA_fig() ###################
# Scatterplot of PC1 vs PC2 from a PCA_gen() result. Points are filled by the
# column named in `fill` (default "Pop"); axis labels show the percentage of
# variance explained. Returns a ggplot object.
PCA_fig <- function(pca_out, fill = "Pop"){ #add PCA setting or normal
  pct <- round(pca_out$pve[1:2] * 100, 2)
  axis_labels <- paste("PCA", 1:2, " (", pct, "%)", sep = "")
  plot_df <- pca_out$pca_df
  filler <- as.character(plot_df[[fill]])
  ggplot(data = plot_df, aes(x = PC1, y = PC2, fill = filler)) +
    geom_point(pch = 21, colour = 'black', size = 3) +
    xlab(axis_labels[1]) +
    ylab(axis_labels[2]) +
    theme_bw() +
    theme(#legend.position = 'none',
      axis.text = element_text(size = 13),
      axis.title = element_text(size = 16, colour = "black", face = "bold", vjust = 1),
      panel.border = element_rect(size = 1.5, colour = "black"),
      legend.text = element_text(size = 13),
      legend.title = element_text(size = 16, colour = "black", face = "bold", vjust = 1),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank())
}
######
# Load the vcftools .012 genotype matrix; its first column is a row index.
df012 <- fread("basa.012", sep = "\t", data.table = F)
df012 <- df012[, -1]
Pop_ID_Sum <- read.csv('Pop_ID_sin.csv')
#############
pca_out <- PCA_gen(df012, Pop_ID_Sum)
# head() instead of [1:50] so short pve vectors are not NA-padded.
pve <- head(pca_out$pve, 50)
pve
# Keep only the PC score columns (PCA_gen appends Pop and ID, re-attached
# from Pop_ID_Sum below). Fixed: the original sliced [,1:50], which fails
# because the default run returns only 5 PCs + Pop + ID (7 columns).
pc_cols <- grep("^PC", names(pca_out$pca_df), value = TRUE)
pca_df <- pca_out$pca_df[, pc_cols]
pca_df <- cbind(Pop_ID_Sum, pca_df)
# Fixed: was 'pca_df_.csv' while the plotting code below reads "pca_df.csv".
write.csv(pca_df, 'pca_df.csv', row.names = FALSE)
##############
pca_df <- read.csv("pca_df.csv")
# PVE for PC1-PC5, hard-coded from the PCA run above.
pve <- c(0.06575, 0.03794, 0.02006, 0.01616, 0.01205)
# Population colour palette (shared with the map/UMAP figures).
# Fixed: in the original the closing ")" sat inside the trailing
# "#BAH03xB.hybrid..." comment, leaving the expression unterminated.
group.colors <- c(AN = "#F0F8FF", AS = "#E9967A", BH = "#FFEBCD", BB = "#F08080",
                  CF = "#8B0000", CC = "#FF3030", CN = "#CD6090", CL = "#8B3A62",
                  EC = "#FF82AB", GA = "#FFC0CB", GB = "#FFE1FF", GJ = "#CDB5CD",
                  JV = "#8B7B8B", KA = "#9B30FF", KB = "#8470FF", KM = "#27408B",
                  LM = "#87CEFA", LL = "#104E8B", LC = "#8DB6CD", MM = "#6E7B8B",
                  MT = "#FFB90F", QG = "#EE7600", RC = "#FFF68F", RL = "#8B7500",
                  SL = "#2F4F4F", SD = "#008B8B", SP = "#006400", SC = "#BDB76B",
                  SY = "#ADFF2F", TM = "#00CD66", RP = "#00CDCD", WC = "#BBFFFF")
                  # B.hookeri = "#ffff00ff", BAH03xB.hybrid = "#000000ba"
# PC1 vs PC2 coloured by population. (The redundant aes colour mapping was
# dropped; the layer's colour = 'black' overrode it anyway.)
PCA1VS2 <- ggplot(data = pca_df, aes(x = PC1, y = PC2, fill = as.character(Pop))) +
  geom_point(pch = 21, colour = 'black', size = 5) +
  xlab(paste("PC", 1, " (", pve[1] * 100, "%)", sep = "")) +
  ylab(paste("PC", 2, " (", pve[2] * 100, "%)", sep = "")) +
  scale_fill_manual(values = group.colors) + theme_bw() +
  theme(legend.position = 'right',
        axis.text = element_text(size = 11),
        axis.title = element_text(size = 13, colour = "black", face = "bold", vjust = 1),
        panel.border = element_rect(size = 1.5, colour = "black"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())
ggsave("pca1_2.pdf", PCA1VS2, height = 8, width = 10, units = 'in')
UMAP:
#### UMAP
# UMAP embedding of the same 012 genotype matrix used for the PCA.
custom.settings = umap.defaults
custom.settings$min_dist = 0.9
custom.settings$n_neighbors = 14
# Two-column layout (one row per individual).
umap_g <- as.data.frame(umap(df012,config = custom.settings)$layout )
names(umap_g) <- c('layout1','layout2')
umap_g <- cbind(Pop_ID_Sum,umap_g)
#### UMAP gprob ####
# Scatter of the two UMAP axes, filled by population with the shared palette.
umap_g_plot <- ggplot(data = umap_g, aes(x=layout1,y=layout2,fill=as.character(Pop))) +
geom_point(colour='black',size = 5,pch=21) + #ggtitle("UMAP n_neighbors 14 min_dist 0.8") +
xlab('Layout 1') + ylab('Layout 2') +
scale_fill_manual(values=group.colors) +
theme_bw() +
theme(legend.position = 'right',
axis.text = element_text(size=11),
axis.title = element_text(size = 13, colour="black",face = "bold",vjust = 1),
panel.border = element_rect(size = 1.5, colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank())
ggsave("umap_09_14.pdf",umap_g_plot,height=8,width = 10,units = 'in')
entropy:
#perl /working/mascaro/acth/entropy2/vcf2mpglV1.3TLP.pl basa.vcf
#perl /working/mascaro/acth/entropy2/gl2genestV1.3.pl basa.mpgl mean
########### PCA_entropy
##row = loci, col = indv
require(readr)
require(MASS)
require(LEA)
require(ggplot2)
# PCA of a genotype point-estimate matrix (rows = individuals, cols = loci)
# using the same sqrt(p(1-p)) normalization as PCA_gen().
#
# g:   numeric matrix of genotype estimates in [0, 2].
# num: number of PC score columns to return (default 27, matching the old
#      hard-coded behaviour; clamped to the number of PCs that exist).
#
# Returns a matrix of PC scores (individuals x num).
PCA_entropy <- function(g, num = 27){
  colmean <- apply(g, 2, mean, na.rm = TRUE)
  normalize <- matrix(nrow = nrow(g), ncol = ncol(g))
  af <- colmean / 2  # per-locus allele-frequency estimate
  for (m in seq_along(af)){
    nr <- g[, m] - colmean[m]
    dn <- sqrt(af[m] * (1 - af[m]))
    normalize[, m] <- nr / dn
  }
  # Monomorphic loci produce NA/NaN after scaling; zero them out.
  normalize[is.na(normalize)] <- 0
  method1 <- prcomp(normalize, scale. = FALSE, center = FALSE)
  pve <- summary(method1)$importance[2, ]
  print(head(pve, 5))
  # Fixed: was a hard-coded x[, 1:27], which errors whenever fewer than
  # 27 PCs exist; `num` keeps the old default but is clamped.
  num <- min(num, ncol(method1$x))
  pca_df <- method1$x[, 1:num, drop = FALSE]
  return(pca_df)
}
require(readr)
require(MASS)
require(LEA)
require(ggplot2)
# Genotype point estimates from gl2genest (rows = loci, cols = individuals).
g <- read.table("pntest_mean_variants_maf5_miss9_thin100_noBadInds.recode.txt", header=F)
Pop_ID_Sum <- read.csv("Pop_ID.csv")
# Transpose so rows = individuals before running PCA.
pca_df <- PCA_entropy(t(g))
#Header for entropy:
Pop_ID <- read.csv("Pop_ID.csv")
# NOTE(review): the "AT" prefix looks inherited from another project's
# scripts — confirm it is the intended species code for BASA.
Sp_Pop <- paste("AT",Pop_ID$Pop,sep="_")
Pop_ID <- paste(Pop_ID$Pop,Pop_ID$ID,sep="_")
# Two header rows (species_pop and pop_ID, one column per individual) for the
# entropy input file.
Header <- data.frame(Sp_Pop,Pop_ID)
write.table(t(Header),'entropy_header.txt',sep = " ", quote = FALSE,row.names =FALSE)
dim(g)
#[1] loci_individuals
Pop_ID_Sum <- read.csv("Pop_ID_Sum.csv")
pca_df <- PCA_entropy(t(g)) ######change PCA_entropy to allow for selecting PC# output and tw
# (A stray "test" token was removed here: evaluating it would raise
# "object 'test' not found".)
# For K = 2..10: k-means on the first 5 PCs, then LDA with leave-one-out CV
# on the cluster assignments; the rounded posterior probabilities are written
# to ldakK.txt and later seed entropy's q matrices (-q flag).
library(MASS)
for (k in 2:10) {
  km <- kmeans(pca_df[, 1:5], k, iter.max = 10, nstart = 10,
               algorithm = "Hartigan-Wong")
  ld <- lda(x = pca_df[, 1:5], grouping = km$cluster, CV = TRUE)
  write.table(round(ld$posterior, 5), file = paste0("ldak", k, ".txt"),
              quote = F, row.names = F, col.names = F)
}
#cat entropy_header.txt basa.mpgl > good_snps.entropy.mpgl
## add "584 36161 1" (n_individuals n_loci ploidy-model line) to the top of entropy_header.txt and leave that line blank! See my example.
#########
# entropy MCMC for K = 2..10: 70k iterations (-l), 30k burn-in (-b), thin 10 (-t),
# each chain seeded from the matching cross-validated LDA posteriors (-q ldakK.txt).
# BUG FIXED: the K=3 run previously reused ldak2.txt as its starting values.
for K in $(seq 2 10); do
  entropy -i basa_entropy.mpgl -o basa_k${K}.hdf5 \
    -l 70000 -b 30000 -t 10 -s 20 -e .01 \
    -k ${K} -q ldak${K}.txt -m 1 -w 0 &> k${K}stdout.txt
done
### DIC value
module load entropy/1.2

### Extract the DIC (deviance) for each K value:
for K in $(seq 2 10); do
  estpost.entropy basa_k${K}.hdf5 -s 3 -p deviance > DIC_k${K}.txt
done

### Generate entropy q (admixture proportion) files:
for K in $(seq 2 10); do
  estpost.entropy basa_k${K}.hdf5 -p q -s 0 -o q${K}.txt
done

### Generate gprobs (genotype probability) file:
estpost.entropy basa_k2.hdf5 -p gprob -s 0 -o gprob.txt
# Plot K=2 admixture proportions as a stacked barplot, one bar per sample,
# faceted by population.  Expects long-format columns in q2.csv:
# sampleID, prob (ancestry proportion), popGroup (cluster), loc (population).
library(ggplot2)
library(forcats)
library(ggthemes)
library(patchwork)

kdf2 <- read.csv("/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/entropy/definitivo/q2.csv")

k2plot <-
  ggplot(kdf2, aes(factor(sampleID), prob, fill = factor(popGroup))) +
  geom_col(color = "gray", size = 0.1) +
  # one facet per population, kept in input order so samples stay grouped
  facet_grid(~fct_inorder(loc), switch = "x", scales = "free", space = "free") +
  theme_minimal() + labs(x = "Populations", title = "K=2", y = "Ancestry") +
  scale_y_continuous(expand = c(0, 0)) +
  scale_x_discrete(expand = expansion(add = 1)) +
  theme(
    panel.spacing.x = unit(0.1, "lines"),
    axis.text.x = element_blank(),
    panel.grid = element_blank()
  ) +
  scale_fill_gdocs(guide = FALSE)
k2plot
# BUG FIXED: previously saved `k3plot` (not defined in this section)
# instead of the `k2plot` object built above.
ggsave("k2.pdf", k2plot, height = 8, width = 20, units = "in")
pRDA:
############## RDA
library(adegenet)
library(poppr)
library(tidyverse)
library(vcfR)
library(hierfstat)
library(pegas)
library(magrittr)
library(ape)
library(psych)
library(adespatial)
library(vegan)
## Load the genotype matrix (012 format) and the predictor tables for the pRDA.
setwd("/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/pRDA/")
##setwd("/home/caro/Escritorio/")

gen <- read.csv("basa_prda.012", row.names = 1)
##gen <- read.csv("acth_012.csv", row.names = 1)
dim(gen)
sum(is.na(gen))

# Impute missing genotypes at each locus (column) with that locus's
# most common genotype
impute_most_common <- function(geno) {
  most_common <- as.numeric(names(which.max(table(geno))))
  replace(geno, is.na(geno), most_common)
}
gen.imp <- apply(gen, 2, impute_most_common)

## Environmental, geographic, and population-structure predictors
Env <- read.csv("/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/pRDA/Env.csv", header = TRUE)
Coordinates <- read.csv("/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/pRDA/Lon_Lat.csv")
PopStruct <- read.csv("/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/pRDA/pca_df_.csv")

## Single table gathering all predictor variables
Variables <- data.frame(Env, Coordinates, PopStruct)
Variables[1:5, ]
## Null model (intercept only) — lower scope for forward selection
RDA0 <- rda(gen.imp ~ 1, Variables)

## Full model with all candidate predictors — upper scope for selection.
## NOTE: identifiers re-joined where the pasted text had wrapped mid-name
## (WsAETspr was split as "WsAE / Tspr").
RDAfull <- rda(
  gen.imp ~ ppt + tdmean + tmax + tmean + tmin + vpdmax + vpdmin + slope +
    Af + soilawc + sloprad + afrad + heatload + CumlWs + DurCWD + OnsetCWD +
    CumlAET + CumlCWD + CumlPET + WsAETspr + FallAET + MaxCWD + pcseas +
    CumlGDD + AETgdd + DurGDD + mxtmpfal + mxtmpspr + mxtmpsum + mxtmpwin +
    mntmpfal + mntmpspr + mntmpsu + mntmpwin + prcpfal + prcpspr + prcpsum +
    prcpwin + Elevation + mndtmpfal + mndtmpspr + mndtmpsu + mndtmpwin +
    Lon + Lat + PC1 + PC2,
  Variables
)
# Removed: +DeclAET; underscores were also removed from the sample IDs.
## "To conduct the selection procedure we used the ordiR2step function of the
## package vegan and the following stopping criteria: variable significance of
## p < 0.01 using 1000 permutations, and the adjusted R2 of the global model."
## BUG FIXED: `direction = T` is not a valid value — ordiR2step's `direction`
## takes "both" (default) or "forward".
mod <- ordiR2step(RDA0, RDAfull, Pin = 0.01, R2permutations = 1000,
                  R2scope = TRUE, direction = "both")
mod$anova
## Removing correlated predictors (iterative pruning by pairwise Pearson r).
## NOTE: identifiers and the output filename were re-joined where the pasted
## text had wrapped mid-token (CumlCWD, prcpwin, OnsetCWD, correlation2.csv).
env <- subset(Variables,
              select = c(tmin, vpdmin, sloprad, prcpfal, mndtmpfal, tdmean,
                         mxtmpfal, mntmpsu, mxtmpsum, mndtmpsu, mndtmpwin,
                         AETgdd, CumlAET, soilawc, prcpwin, CumlCWD, mntmpfal,
                         MaxCWD, CumlGDD, tmax, tmean, pcseas, afrad, mntmpspr,
                         OnsetCWD, mndtmpspr, CumlWs, mntmpwin, DurCWD))
str(env)
correlation <- cor(env, method = "pearson")
write.csv(correlation, "/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/pRDA/correlation1.csv")

## Second round after dropping the most correlated variables
env2 <- subset(Variables,
               select = c(sloprad, prcpfal, mndtmpfal, tdmean, mxtmpfal,
                          mntmpsu, mxtmpsum, soilawc, prcpwin, CumlCWD,
                          MaxCWD, CumlGDD, tmax, pcseas, afrad, OnsetCWD,
                          CumlWs))
correlation2 <- cor(env2, method = "pearson")
write.csv(correlation2, "/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/pRDA/correlation2.csv")

## Final retained predictor set
env3 <- subset(Variables,
               select = c(sloprad, tdmean, mxtmpfal, mxtmpsum, soilawc,
                          prcpwin, tmax, pcseas, afrad, OnsetCWD, CumlWs))
## BUG FIXED: the pdf device must be opened BEFORE plotting and closed after;
## previously pdf() was called after pairs.panels(), leaving an empty file and
## an open device.  Also `methods=` corrected to `method=`.
pdf("/home/caro/Escritorio/BASA_2/con_outgroup/bcftools/pRDA/Pairs_pannel3.pdf")
pairs.panels(env3, scale = TRUE, density = TRUE, ellipses = TRUE,
             method = "pearson")
dev.off()
#######
####### Selected: sloprad,tdmean,mxtmpfal,mxtmpsum,soilawc,prcpwin,tmax,pcseas,afrad,OnsetCWD,CumlWs
## Full model: climate predictors + population structure (PC1, PC2) +
## geography (Lon, Lat).
## BUG FIXED: stray duplicated "+" removed from the formula ("PC2+ + Lon").
pRDAfull <- rda(gen.imp ~ PC1 + PC2 + Lon + Lat + sloprad + tdmean +
                  mxtmpfal + mxtmpsum + soilawc + prcpwin + tmax + pcseas +
                  afrad + OnsetCWD + CumlWs, Variables)
pRDAfull
#RsquareAdj(pRDAfull)
#anova(pRDAfull)

## Pure climate model: climate effects conditioned on (i.e. after removing)
## population structure and geography.
pRDAclim <- rda(gen.imp ~ sloprad + tdmean + mxtmpfal + mxtmpsum + soilawc +
                  prcpwin + tmax + pcseas + afrad + OnsetCWD + CumlWs +
                  Condition(PC1 + PC2 + Lon + Lat), Variables)
pRDAclim
#RsquareAdj(pRDAclim)
#anova(pRDAclim)
# Model summaries
#RsquareAdj(pRDAclim) # adjusted Rsquared
#The eigenvalues for the constrained axes reflect the variance explained by each canonical axis:
#summary(pRDAclim, display=NULL) #display = NULL optional
#summary(eigenvals(pRDAclim, model = "constrained"))
#screeplot(pRDAclim)
#check our RDA model for significance using formal tests. We can assess both the full model and each constrained axis using F-statistics (Legendre et al,
2010). The null hypothesis is that no linear relationship exists between the SNP data and the environmental predictors.
#anova.cca(pRDAclim, permutations = 100) # full model
#anova.cca(pRDAclim, permutations = 100, by="margin") # per variable
#signif.axis <- anova.cca(pRDAclim, by="axis", parallel=getOption("mc.cores"))
#signif.axis
#vif.cca(pRDAclim) # variance inflation factor (<10 OK)
# Model: rda(formula = gen.imp ~ AETgdd + AfRad + FallAET + MaxCWD + mxtmpwin + prcpsum + SlopRad + WsAETspr + Condition(PC1 + PC2 + Lon + Lat), data =
Variables)
# Df Variance F Pr(>F)
# AETgdd 1 11.505 15.742 0.009901 **
# AfRad 1 17.800 24.356 0.009901 **
# FallAET 1 14.477 19.808 0.009901 **
# MaxCWD 1 21.014 28.754 0.009901 **
# mxtmpwin 1 15.724 21.516 0.009901 **
# prcpsum 1 16.850 23.056 0.009901 **
# SlopRad 1 13.357 18.276 0.009901 **
# WsAETspr 1 15.852 21.691 0.009901 **
## Pure neutral population structure model: variance explained by PC1/PC2
## after conditioning out geography and the selected climate predictors.
pRDAstruct <- rda(gen.imp ~ PC1 + PC2 + Condition(Lon + Lat + sloprad+tdmean+mxtmpfal+mxtmpsum+soilawc+prcpwin+tmax+pcseas+afrad+OnsetCWD+CumlWs),
Variables)
pRDAstruct
#RsquareAdj(pRDAstruct)
#anova(pRDAstruct)
## Pure geography model: variance explained by Lon/Lat after conditioning
## out climate and population structure.
pRDAgeog <- rda(gen.imp ~ Lon + Lat + Condition(sloprad+tdmean+mxtmpfal+mxtmpsum+soilawc+prcpwin+tmax+pcseas+afrad+OnsetCWD+CumlWs + PC1 + PC2), Variables)
pRDAgeog
#RsquareAdj(pRDAgeog)
#anova(pRDAgeog)
| Partial RDA models | Inertia | R2 | P (>F) | Proportion of explainable variance | Proportion of total variance |
|---|---|---|---|---|---|
| Full model: gen ~ env + structure + geog | 30.45 | 0.09 | 0.001*** | 1.000 | 0.090 |
| Pure climate: gen ~ env (structure + geog) | 14.71 | 0.04 | 0.001*** | 0.483 | 0.043 |
| Pure structure: gen ~ structure (env + geog) | 6.04 | 0.02 | 0.001*** | 0.182 | 0.016 |
| Pure geography: gen ~ geog (env + structure) | 3.13 | 0.01 | 0.001*** | 0.115 | 0.010 |
| Confounded climate/structure/geography | 6.57 | NA | NA | 0.233 | 0.021 |
| Total unexplained | 298.20 | NA | NA | NA | NA |
| Total inertia | 328.65 | NA | NA | NA | NA |
#EEMS
# Pairwise genetic dissimilarity matrix in EEMS .diffs format
# (example dataset, used as a sanity check of the input format)
diffs <- read.table("./test/example-SNP-major-mode.diffs")
diffs
datapath = ./data/inputdata
mcmcpath = ./data/outputdata
nIndiv = 246
nSites = 5677
nDemes = 300
diploid = false
numMCMCIter = 200000000
numBurnIter = 100000000
numThinIter = 999999
./runeems_snps --params configurationfile.ini --seed 123 (random seed, optional)
./runeems_snps --params configurationfile2.ini --seed 123
./runeems_snps --params configurationfile3.ini --seed 123
EEMS results visualization:
## Paths for rEEMSplots: the three independent MCMC chains are averaged
## into one set of migration/diversity surface plots.
library(rEEMSplots)
mcmcpath <- c("./eems_output/output_1", "./eems_output/output_2", "./eems_output/output_3")
## BUG FIXED: was `plotpath = (""./eems_output/plots"")` — doubled quotes are a
## syntax error; use a plain string (and `<-` for top-level assignment).
plotpath <- "./eems_output/plots"
#unPC
## unPC: ratio of PCA-space distance to geographic distance between
## population pairs (highlights barriers/corridors to gene flow).
library(unPC)
unPC(inputToProcess = "pca_eigen.evec", outputPrefix = "unPC",
     runDataImport = TRUE, runPairwiseCalc = TRUE, geogrCoords = "Lon_Lat.txt",
     roundEarth = TRUE, firstPC = 1, secondPC = 2, runPlotting = TRUE,
     applyManualColor = FALSE, geogrCoordsForPlotting = "Lon_Lat_popnAggregated.txt",
     plotWithMap = FALSE,
     ellipseWidth = 0.05, populationPointNormalization = 7,
     runAggregated = TRUE, savePlotsToPdf = TRUE)

unpcout <- readRDS("unPC_pairwiseDistCalc_unPC.rds")
## Extract the unPC scores (PC distance / geographic distance).
## Renamed from `unPC` so the unPC() function above is not masked.
unpc_scores <- unpcout$ratioPCToGeogrDist
## BUG FIXED: previously wrote the whole `unpcout` object to unPCscores.csv
## instead of the extracted scores.
write.csv(unpc_scores, "unPCscores.csv")
Genetic distances:
B. hookeri seems to be more differentiated from B. sagittata and B. hybrid based on pairwise genetic distances.
.inline-btns p {
display: inline;
}
library(downloadthis)
# Pairwise Fst / genetic-distance table used for the report's download button
Fst_results <- read.csv("/home/caro/Escritorio/BASA_2/basa_Rmd/Fst.csv")
# Render an HTML download button for the table (R Markdown output).
# NOTE(review): `path = system.file(...)` looks wrong — system.file() resolves
# files *inside* an installed package, so an absolute path returns ""; confirm
# whether `path` is needed at all when piping a data frame into download_this().
d_btn <- Fst_results %>%
download_this(
path = system.file("/home/caro/Escritorio/BASA_2/basa_Rmd/", package = "downloadthis"),
output_name = "Genetic distances",
output_extension = ".csv",
button_label = "Genetic distances",
button_type = "default",
has_icon = TRUE,
icon = "fa fa-save"
)
Phylogeny IQtree:
Based on the phylogeny, B. hookeri and B. hybrid are embedded within B. sagittata.
### vcf to nexus (binary) for PhyloNet
# vcf2phylip: -i input VCF, -o outgroup sample, -b write binary NEXUS.
# FIXED: added missing ".py", removed the pasted "bash-4.2$" shell prompt,
# and replaced the invalid "--i" with "-i".  "--phy" was dropped: phylip is
# the default output, and "--phy" prefix-matches --phylip-disable, which
# would have suppressed the .phy file IQ-TREE needs.
python vcf2phylip.py -i basa_outg.vcf -o LW -b
python vcf2phylip.py -i basa_15_50_70.recode.vcf
python vcf2phylip.py -i basa.vcf
# Maximum-likelihood tree with ascertainment-bias correction (SNP data),
# 1000 ultrafast bootstrap replicates.
./iqtree -s basa.phy -nt 10 -m GTR+ASC -bb 1000
Fbranch:
Based on Fbranch results (very conservative parameters) there is gene flow among some B. sagittata populations and B. hookeri and B. hybrid. There is also some ancestral gene flow among B. sagittata and basal clades of B. hookeri and B. hybrid.
./Dsuite Dtrios basa.vcf SETS.txt -t tree_trimmed.tree
./Dsuite Fbranch -Z --Pb-matrix tree_trimmed.tree SETS_tree.txt > fbranch_collapsed_p.txt
head -n 79 fbranch_collapsed_p.txt > f_plotting.txt
# Read the Fbranch z-score matrix (tab-separated), zero out every value
# below the 5% threshold, and write the filtered table for dtools plotting.
zscores <- read.delim("f_plotting.txt")
zscores[zscores < 0.05] <- 0
write.table(zscores, "f_5_percent_plotting_trimmed.txt",
            sep = "\t", row.names = FALSE, quote = FALSE)
./dtools.py f_5_percent_plotting_trimmed.txt tree_trimmed.tree --outgroup LW_ME_2 --ladderize
PhyloNet:
Three reticulations so far. Still running with 4 iterations. Then see what network is the most optimal.
### Trimming the vcf:
vcftools --vcf basa_final.vcf --remove remove.txt (a single individual per population, except for the outgroups)
Running PhyloNet from 0 to 10 networks:
# nexus file, each sample with a single letter different ID
# First 0 network, then 1 network, 2 etc.. based on the former network (see example file), 10 iterations
java -jar PhyloNet.jar phylonet_todos0.nex > results.txt